import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# Read the cancer dataset from disk; the CSV is decoded as ISO-8859-1 (Latin-1).
data = pd.read_csv(
    "dataset/cancer_reg.csv",
    encoding="ISO-8859-1",
)
# Preview the first five rows.
data.head(5)
| avgAnnCount | avgDeathsPerYear | TARGET_deathRate | incidenceRate | medIncome | popEst2015 | povertyPercent | studyPerCap | binnedInc | MedianAge | ... | PctPrivateCoverageAlone | PctEmpPrivCoverage | PctPublicCoverage | PctPublicCoverageAlone | PctWhite | PctBlack | PctAsian | PctOtherRace | PctMarriedHouseholds | BirthRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1397.0 | 469 | 164.9 | 489.8 | 61898 | 260131 | 11.2 | 499.748204 | (61494.5, 125635] | 39.3 | ... | NaN | 41.6 | 32.9 | 14.0 | 81.780529 | 2.594728 | 4.821857 | 1.843479 | 52.856076 | 6.118831 |
| 1 | 173.0 | 70 | 161.3 | 411.6 | 48127 | 43269 | 18.6 | 23.111234 | (48021.6, 51046.4] | 33.0 | ... | 53.8 | 43.6 | 31.1 | 15.3 | 89.228509 | 0.969102 | 2.246233 | 3.741352 | 45.372500 | 4.333096 |
| 2 | 102.0 | 50 | 174.7 | 349.7 | 49348 | 21026 | 14.6 | 47.560164 | (48021.6, 51046.4] | 45.0 | ... | 43.5 | 34.9 | 42.1 | 21.1 | 90.922190 | 0.739673 | 0.465898 | 2.747358 | 54.444868 | 3.729488 |
| 3 | 427.0 | 202 | 194.8 | 430.4 | 44243 | 75882 | 17.1 | 342.637253 | (42724.4, 45201] | 42.8 | ... | 40.3 | 35.0 | 45.3 | 25.0 | 91.744686 | 0.782626 | 1.161359 | 1.362643 | 51.021514 | 4.603841 |
| 4 | 57.0 | 26 | 144.4 | 350.1 | 49955 | 10321 | 12.5 | 0.000000 | (48021.6, 51046.4] | 48.3 | ... | 43.9 | 35.1 | 44.0 | 22.7 | 94.104024 | 0.270192 | 0.665830 | 0.492135 | 54.027460 | 6.796657 |
5 rows × 34 columns
# Print each column's dtype and non-null count; columns with fewer non-null
# entries than the row count contain missing values.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3047 entries, 0 to 3046 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 avgAnnCount 3047 non-null float64 1 avgDeathsPerYear 3047 non-null int64 2 TARGET_deathRate 3047 non-null float64 3 incidenceRate 3047 non-null float64 4 medIncome 3047 non-null int64 5 popEst2015 3047 non-null int64 6 povertyPercent 3047 non-null float64 7 studyPerCap 3047 non-null float64 8 binnedInc 3047 non-null object 9 MedianAge 3047 non-null float64 10 MedianAgeMale 3047 non-null float64 11 MedianAgeFemale 3047 non-null float64 12 Geography 3047 non-null object 13 AvgHouseholdSize 3047 non-null float64 14 PercentMarried 3047 non-null float64 15 PctNoHS18_24 3047 non-null float64 16 PctHS18_24 3047 non-null float64 17 PctSomeCol18_24 762 non-null float64 18 PctBachDeg18_24 3047 non-null float64 19 PctHS25_Over 3047 non-null float64 20 PctBachDeg25_Over 3047 non-null float64 21 PctEmployed16_Over 2895 non-null float64 22 PctUnemployed16_Over 3047 non-null float64 23 PctPrivateCoverage 3047 non-null float64 24 PctPrivateCoverageAlone 2438 non-null float64 25 PctEmpPrivCoverage 3047 non-null float64 26 PctPublicCoverage 3047 non-null float64 27 PctPublicCoverageAlone 3047 non-null float64 28 PctWhite 3047 non-null float64 29 PctBlack 3047 non-null float64 30 PctAsian 3047 non-null float64 31 PctOtherRace 3047 non-null float64 32 PctMarriedHouseholds 3047 non-null float64 33 BirthRate 3047 non-null float64 dtypes: float64(29), int64(3), object(2) memory usage: 809.5+ KB
# Summary statistics for every column; include='all' also covers the
# non-numeric (object) columns, which report unique/top/freq instead.
data.describe(include='all')
| avgAnnCount | avgDeathsPerYear | TARGET_deathRate | incidenceRate | medIncome | popEst2015 | povertyPercent | studyPerCap | binnedInc | MedianAge | ... | PctPrivateCoverageAlone | PctEmpPrivCoverage | PctPublicCoverage | PctPublicCoverageAlone | PctWhite | PctBlack | PctAsian | PctOtherRace | PctMarriedHouseholds | BirthRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3047.000000 | 3047.000000 | 3047.000000 | 3047.000000 | 3047.000000 | 3.047000e+03 | 3047.000000 | 3047.000000 | 3047 | 3047.000000 | ... | 2438.000000 | 3047.000000 | 3047.000000 | 3047.000000 | 3047.000000 | 3047.000000 | 3047.000000 | 3047.000000 | 3047.000000 | 3047.000000 |
| unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | (45201, 48021.6] | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 306 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | 606.338544 | 185.965868 | 178.664063 | 448.268586 | 47063.281917 | 1.026374e+05 | 16.878175 | 155.399415 | NaN | 45.272333 | ... | 48.453774 | 41.196324 | 36.252642 | 19.240072 | 83.645286 | 9.107978 | 1.253965 | 1.983523 | 51.243872 | 5.640306 |
| std | 1416.356223 | 504.134286 | 27.751511 | 54.560733 | 12040.090836 | 3.290592e+05 | 6.409087 | 529.628366 | NaN | 45.304480 | ... | 10.083006 | 9.447687 | 7.841741 | 6.113041 | 16.380025 | 14.534538 | 2.610276 | 3.517710 | 6.572814 | 1.985816 |
| min | 6.000000 | 3.000000 | 59.700000 | 201.300000 | 22640.000000 | 8.270000e+02 | 3.200000 | 0.000000 | NaN | 22.300000 | ... | 15.700000 | 13.500000 | 11.200000 | 2.600000 | 10.199155 | 0.000000 | 0.000000 | 0.000000 | 22.992490 | 0.000000 |
| 25% | 76.000000 | 28.000000 | 161.200000 | 420.300000 | 38882.500000 | 1.168400e+04 | 12.150000 | 0.000000 | NaN | 37.700000 | ... | 41.000000 | 34.500000 | 30.900000 | 14.850000 | 77.296180 | 0.620675 | 0.254199 | 0.295172 | 47.763063 | 4.521419 |
| 50% | 171.000000 | 61.000000 | 178.100000 | 453.549422 | 45207.000000 | 2.664300e+04 | 15.900000 | 0.000000 | NaN | 41.000000 | ... | 48.700000 | 41.100000 | 36.300000 | 18.800000 | 90.059774 | 2.247576 | 0.549812 | 0.826185 | 51.669941 | 5.381478 |
| 75% | 518.000000 | 149.000000 | 195.200000 | 480.850000 | 52492.000000 | 6.867100e+04 | 20.400000 | 83.650776 | NaN | 44.000000 | ... | 55.600000 | 47.700000 | 41.550000 | 23.100000 | 95.451693 | 10.509732 | 1.221037 | 2.177960 | 55.395132 | 6.493677 |
| max | 38150.000000 | 14010.000000 | 362.800000 | 1206.900000 | 125635.000000 | 1.017029e+07 | 47.400000 | 9762.308998 | NaN | 624.000000 | ... | 78.900000 | 70.700000 | 65.100000 | 46.600000 | 100.000000 | 85.947799 | 42.619425 | 41.930251 | 78.075397 | 21.326165 |
11 rows × 34 columns
# Boolean mask with the same shape as `data`: True where a value is missing.
missing = data.isna()
# Preview the first five rows of the mask.
missing.head(5)
| avgAnnCount | avgDeathsPerYear | TARGET_deathRate | incidenceRate | medIncome | popEst2015 | povertyPercent | studyPerCap | binnedInc | MedianAge | ... | PctPrivateCoverageAlone | PctEmpPrivCoverage | PctPublicCoverage | PctPublicCoverageAlone | PctWhite | PctBlack | PctAsian | PctOtherRace | PctMarriedHouseholds | BirthRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | ... | True | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 34 columns
# For every column, print its name followed by the counts of missing (True)
# and present (False) entries, with a blank line between columns.
for column in missing.columns:
    print(column)
    print(missing[column].value_counts())
    print("")
avgAnnCount False 3047 Name: avgAnnCount, dtype: int64 avgDeathsPerYear False 3047 Name: avgDeathsPerYear, dtype: int64 TARGET_deathRate False 3047 Name: TARGET_deathRate, dtype: int64 incidenceRate False 3047 Name: incidenceRate, dtype: int64 medIncome False 3047 Name: medIncome, dtype: int64 popEst2015 False 3047 Name: popEst2015, dtype: int64 povertyPercent False 3047 Name: povertyPercent, dtype: int64 studyPerCap False 3047 Name: studyPerCap, dtype: int64 binnedInc False 3047 Name: binnedInc, dtype: int64 MedianAge False 3047 Name: MedianAge, dtype: int64 MedianAgeMale False 3047 Name: MedianAgeMale, dtype: int64 MedianAgeFemale False 3047 Name: MedianAgeFemale, dtype: int64 Geography False 3047 Name: Geography, dtype: int64 AvgHouseholdSize False 3047 Name: AvgHouseholdSize, dtype: int64 PercentMarried False 3047 Name: PercentMarried, dtype: int64 PctNoHS18_24 False 3047 Name: PctNoHS18_24, dtype: int64 PctHS18_24 False 3047 Name: PctHS18_24, dtype: int64 PctSomeCol18_24 True 2285 False 762 Name: PctSomeCol18_24, dtype: int64 PctBachDeg18_24 False 3047 Name: PctBachDeg18_24, dtype: int64 PctHS25_Over False 3047 Name: PctHS25_Over, dtype: int64 PctBachDeg25_Over False 3047 Name: PctBachDeg25_Over, dtype: int64 PctEmployed16_Over False 2895 True 152 Name: PctEmployed16_Over, dtype: int64 PctUnemployed16_Over False 3047 Name: PctUnemployed16_Over, dtype: int64 PctPrivateCoverage False 3047 Name: PctPrivateCoverage, dtype: int64 PctPrivateCoverageAlone False 2438 True 609 Name: PctPrivateCoverageAlone, dtype: int64 PctEmpPrivCoverage False 3047 Name: PctEmpPrivCoverage, dtype: int64 PctPublicCoverage False 3047 Name: PctPublicCoverage, dtype: int64 PctPublicCoverageAlone False 3047 Name: PctPublicCoverageAlone, dtype: int64 PctWhite False 3047 Name: PctWhite, dtype: int64 PctBlack False 3047 Name: PctBlack, dtype: int64 PctAsian False 3047 Name: PctAsian, dtype: int64 PctOtherRace False 3047 Name: PctOtherRace, dtype: int64 PctMarriedHouseholds False 3047 Name: 
PctMarriedHouseholds, dtype: int64 BirthRate False 3047 Name: BirthRate, dtype: int64
Based on the summary above, the dataset has 3047 rows,
and 3 columns contain missing values:
1.PctSomeCol18_24 : 2285
2.PctEmployed16_Over : 152
3.PctPrivateCoverageAlone : 609
PctSomeCol18_24 has 74.99% missing values
PctEmployed16_Over has 4.99% missing values
PctPrivateCoverageAlone has 19.99% missing values
Since most of PctSomeCol18_24 is missing, we drop that column, and we replace the missing values of PctEmployed16_Over and PctPrivateCoverageAlone with their respective column means.
# Compute the mean of each column that will be imputed; Series.mean skips
# NaN entries by default, so only the observed values contribute.
employed_values = data['PctEmployed16_Over'].astype('float')
avg_PctEmp = employed_values.mean(axis=0)
print("avg loss of PctEmployed16_Over", avg_PctEmp)

private_alone_values = data['PctPrivateCoverageAlone'].astype('float')
avg_PctPri = private_alone_values.mean(axis=0)
print("avg loss of PctPrivateCoverageAlone", avg_PctPri)
avg loss of PctEmployed16_Over 54.15264248704645 avg loss of PctPrivateCoverageAlone 48.45377358490559
# Fill each column's missing entries with that column's OWN mean
# (avg_PctEmp for PctEmployed16_Over, avg_PctPri for PctPrivateCoverageAlone).
# The result is assigned back to the DataFrame rather than calling
# replace(..., inplace=True) on the selected column, because an in-place
# operation on a selected column is not guaranteed to update `data` itself.
data['PctEmployed16_Over'] = data['PctEmployed16_Over'].fillna(avg_PctEmp)
data['PctPrivateCoverageAlone'] = data['PctPrivateCoverageAlone'].fillna(avg_PctPri)
# Re-check the non-null counts after filling the missing values.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3047 entries, 0 to 3046 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 avgAnnCount 3047 non-null float64 1 avgDeathsPerYear 3047 non-null int64 2 TARGET_deathRate 3047 non-null float64 3 incidenceRate 3047 non-null float64 4 medIncome 3047 non-null int64 5 popEst2015 3047 non-null int64 6 povertyPercent 3047 non-null float64 7 studyPerCap 3047 non-null float64 8 binnedInc 3047 non-null object 9 MedianAge 3047 non-null float64 10 MedianAgeMale 3047 non-null float64 11 MedianAgeFemale 3047 non-null float64 12 Geography 3047 non-null object 13 AvgHouseholdSize 3047 non-null float64 14 PercentMarried 3047 non-null float64 15 PctNoHS18_24 3047 non-null float64 16 PctHS18_24 3047 non-null float64 17 PctSomeCol18_24 762 non-null float64 18 PctBachDeg18_24 3047 non-null float64 19 PctHS25_Over 3047 non-null float64 20 PctBachDeg25_Over 3047 non-null float64 21 PctEmployed16_Over 3047 non-null float64 22 PctUnemployed16_Over 3047 non-null float64 23 PctPrivateCoverage 3047 non-null float64 24 PctPrivateCoverageAlone 3047 non-null float64 25 PctEmpPrivCoverage 3047 non-null float64 26 PctPublicCoverage 3047 non-null float64 27 PctPublicCoverageAlone 3047 non-null float64 28 PctWhite 3047 non-null float64 29 PctBlack 3047 non-null float64 30 PctAsian 3047 non-null float64 31 PctOtherRace 3047 non-null float64 32 PctMarriedHouseholds 3047 non-null float64 33 BirthRate 3047 non-null float64 dtypes: float64(29), int64(3), object(2) memory usage: 809.5+ KB
# Remove the mostly-empty PctSomeCol18_24 column from `data` in place,
# then print the column summary to confirm it is gone.
data.drop(columns=['PctSomeCol18_24'], inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3047 entries, 0 to 3046 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 avgAnnCount 3047 non-null float64 1 avgDeathsPerYear 3047 non-null int64 2 TARGET_deathRate 3047 non-null float64 3 incidenceRate 3047 non-null float64 4 medIncome 3047 non-null int64 5 popEst2015 3047 non-null int64 6 povertyPercent 3047 non-null float64 7 studyPerCap 3047 non-null float64 8 binnedInc 3047 non-null object 9 MedianAge 3047 non-null float64 10 MedianAgeMale 3047 non-null float64 11 MedianAgeFemale 3047 non-null float64 12 Geography 3047 non-null object 13 AvgHouseholdSize 3047 non-null float64 14 PercentMarried 3047 non-null float64 15 PctNoHS18_24 3047 non-null float64 16 PctHS18_24 3047 non-null float64 17 PctBachDeg18_24 3047 non-null float64 18 PctHS25_Over 3047 non-null float64 19 PctBachDeg25_Over 3047 non-null float64 20 PctEmployed16_Over 3047 non-null float64 21 PctUnemployed16_Over 3047 non-null float64 22 PctPrivateCoverage 3047 non-null float64 23 PctPrivateCoverageAlone 3047 non-null float64 24 PctEmpPrivCoverage 3047 non-null float64 25 PctPublicCoverage 3047 non-null float64 26 PctPublicCoverageAlone 3047 non-null float64 27 PctWhite 3047 non-null float64 28 PctBlack 3047 non-null float64 29 PctAsian 3047 non-null float64 30 PctOtherRace 3047 non-null float64 31 PctMarriedHouseholds 3047 non-null float64 32 BirthRate 3047 non-null float64 dtypes: float64(28), int64(3), object(2) memory usage: 785.7+ KB
# Draw the grid of pairwise plots for the cleaned DataFrame and render it.
pair_grid = sns.pairplot(data)
plt.show()